{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "#### 作业四：\n",
    "在https://www.kaggle.com/c/titanic Kaggle网站上下载和处理泰坦尼克数据集 15分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 数据读取\n",
    "data_s=pd.read_csv('train.csv')\n",
    "data_s.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "#### 文件headers解释\n",
    "- PassengerId:乘客ID\n",
    "- Survived:是否存活\n",
    "- Pclass:仓位等级\n",
    "- Name:姓名\n",
    "- Sex:性别\n",
    "- Age:年龄\n",
    "- SibSp:(sibling+spouse):兄弟姐妹与配偶\n",
    "- Parch:(parents+children):父母与子女\n",
    "- Ticket:票号\n",
    "- Fare:票价\n",
    "- Cabin:仓位\n",
    "- Embarked:登船港口"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      "PassengerId    891 non-null int64\n",
      "Survived       891 non-null int64\n",
      "Pclass         891 non-null int64\n",
      "Name           891 non-null object\n",
      "Sex            891 non-null object\n",
      "Age            714 non-null float64\n",
      "SibSp          891 non-null int64\n",
      "Parch          891 non-null int64\n",
      "Ticket         891 non-null object\n",
      "Fare           891 non-null float64\n",
      "Cabin          204 non-null object\n",
      "Embarked       889 non-null object\n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 83.6+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "#### 文件信息\n",
    "- 总共有891行信息\n",
    "    - 其中Age714行信息，有缺失，后面的处理方法待定\n",
    "    - Cabin：乘客所在船舱，后续根据pclass，fare是否能补充\n",
    "    - Embarked：登船港口缺失两个值，考虑是否忽略数据缺失\n",
    "    - 其他数据不缺失值\n",
    "    - Name,Sex,Ticket,Cabin,Embarked都是object值\n",
    "        - 后续Sex可以转换为独热编码\n",
    "        - PassageId，Name，Ticket都是用来索引数据的，可以不考虑\n",
    "        - "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    549\n",
       "1    342\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "#### 数据集891人，活下来了342个人，549人没有活下来"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3    491\n",
       "1    216\n",
       "2    184\n",
       "Name: Pclass, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 仓位等级 来查看存活 是否存在关系\n",
    "data_s['Pclass'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    136\n",
       "0     80\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Pclass']==1]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    97\n",
       "1    87\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Pclass']==2]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    372\n",
       "1    119\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Pclass']==3]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "#### 数据集中各等级活下来的人\n",
    "- 等级为一的人：216；活下来的人：136个，没有活下来的人：80 个\n",
    "- 等级为二的人：184；活下来的人：87 个，没有活下来的人：97 个\n",
    "- 等级为三的人：491；活下来的人：119个，没有活下来的人：372个"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "80.0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s['Age'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.42"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s['Age'].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>803</th>\n",
       "      <td>804</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Thomas, Master. Assad Alexander</td>\n",
       "      <td>male</td>\n",
       "      <td>0.42</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2625</td>\n",
       "      <td>8.5167</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                             Name   Sex  \\\n",
       "803          804         1       3  Thomas, Master. Assad Alexander  male   \n",
       "\n",
       "      Age  SibSp  Parch Ticket    Fare Cabin Embarked  \n",
       "803  0.42      0      1   2625  8.5167   NaN        C  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Age']==0.42]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>630</th>\n",
       "      <td>631</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Barkworth, Mr. Algernon Henry Wilson</td>\n",
       "      <td>male</td>\n",
       "      <td>80.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>27042</td>\n",
       "      <td>30.0</td>\n",
       "      <td>A23</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                                  Name  \\\n",
       "630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   \n",
       "\n",
       "      Sex   Age  SibSp  Parch Ticket  Fare Cabin Embarked  \n",
       "630  male  80.0      0      0  27042  30.0   A23        S  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Age']==80]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    608\n",
       "1    209\n",
       "2     28\n",
       "4     18\n",
       "3     16\n",
       "8      7\n",
       "5      5\n",
       "Name: SibSp, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#兄弟姐妹\n",
    "data_s['SibSp'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "891"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "sum(data_s['SibSp'].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>159</th>\n",
       "      <td>160</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Master. Thomas Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>180</th>\n",
       "      <td>181</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Miss. Constance Gladys</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>201</th>\n",
       "      <td>202</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Mr. Frederick</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>324</th>\n",
       "      <td>325</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Mr. George John Jr</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>792</th>\n",
       "      <td>793</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Miss. Stella Anna</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>846</th>\n",
       "      <td>847</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Mr. Douglas Bullen</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>863</th>\n",
       "      <td>864</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Miss. Dorothy Edith \"Dolly\"</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                               Name     Sex  \\\n",
       "159          160         0       3         Sage, Master. Thomas Henry    male   \n",
       "180          181         0       3       Sage, Miss. Constance Gladys  female   \n",
       "201          202         0       3                Sage, Mr. Frederick    male   \n",
       "324          325         0       3           Sage, Mr. George John Jr    male   \n",
       "792          793         0       3            Sage, Miss. Stella Anna  female   \n",
       "846          847         0       3           Sage, Mr. Douglas Bullen    male   \n",
       "863          864         0       3  Sage, Miss. Dorothy Edith \"Dolly\"  female   \n",
       "\n",
       "     Age  SibSp  Parch    Ticket   Fare Cabin Embarked  \n",
       "159  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "180  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "201  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "324  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "792  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "846  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "863  NaN      8      2  CA. 2343  69.55   NaN        S  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['SibSp']==8]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "#### Sage 一家没有活下来"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    678\n",
       "1    118\n",
       "2     80\n",
       "5      5\n",
       "3      5\n",
       "4      4\n",
       "6      1\n",
       "Name: Parch, dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 父母与子女\n",
    "data_s['Parch'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    65\n",
       "0    53\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Parch']==1]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    40\n",
       "0    40\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Parch']==2]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "#### 家庭成员的数据解释\n",
    "- 家里有一个成员的，活下来了65个，没活下来的有53个\n",
    "- 家里有两个成员的，活下来了40个，没活下来的有40个\n",
    "- 家里有三个成员的，活下来了3 个，没活下来的有2 个\n",
    "- 家里有四个成员的，活下来了  个，没活下来的有4 个\n",
    "- 家里有五个成员的，活下来了1 个，没活下来的有4 个\n",
    "- 家里有六个成员的，活下来了  个，他们都没有活下来，是Sage一家\n",
    "- 自己一个人登船的，活下来了223个，失去生命的445个"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    3\n",
       "0    2\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Parch']==3]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    4\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Parch']==4]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    4\n",
       "1    1\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Parch']==5]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Parch']==6]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    445\n",
       "1    233\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Parch']==0]['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    468\n",
       "1    109\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 男性，女性\n",
    "data_s[data_s['Sex']=='male']['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    233\n",
       "0     81\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Sex']=='female']['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "总共有891人，其中女性有314，活下来的有233，存活比例74.2%\n",
      "总共有891人，其中男性有577，活下来的有109，存活比例18.9%\n"
     ]
    }
   ],
   "source": [
    "print('总共有891人，其中女性有{}，活下来的有{}，存活比例{:.1f}%'.format(len(data_s[data_s['Sex']=='female']),233,\n",
    "                                                   100*233/len(data_s[data_s['Sex']=='female']),))\n",
    "print('总共有891人，其中男性有{}，活下来的有{}，存活比例{:.1f}%'.format(len(data_s[data_s['Sex']=='male']),109,\n",
    "                                                   100*109/len(data_s[data_s['Sex']=='male'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "shuffle_index=np.random.permutation(891)\n",
    "X=data_s.drop('Survived',axis=1)\n",
    "y=data_s['Survived']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Pclass                                               Name  \\\n",
       "0            1       3                            Braund, Mr. Owen Harris   \n",
       "1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   \n",
       "2            3       3                             Heikkinen, Miss. Laina   \n",
       "3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   \n",
       "4            5       3                           Allen, Mr. William Henry   \n",
       "\n",
       "      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  \n",
       "0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  \n",
       "1  female  38.0      1      0          PC 17599  71.2833   C85        C  \n",
       "2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3  female  35.0      1      0            113803  53.1000  C123        S  \n",
       "4    male  35.0      0      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0\n",
       "1    1\n",
       "2    1\n",
       "3    1\n",
       "4    0\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y[0:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "#### 各舱位票价：英镑\n",
    "- 头等舱：84.2\n",
    "- 二等舱：20.7\n",
    "- 三等舱：13.7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13.675550101832997"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Pclass']==3]['Fare'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20.66218315217391"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Pclass']==2]['Fare'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "84.15468749999992"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Pclass']==1]['Fare'].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Attention!!!\n",
    "\n",
    "- 对于任何一份数据都应该客观冷静处理，首先其是一份数据，先按照数据的处理流程去做，因为现在还是初学：学习阶段，不应该乱七八糟的理论掺杂进来，先做事\n",
    "- 下一个阶段是把这份数据处理好，这个是下个阶段，因为行百里者半九十，你做到前面处理好了，不能就以为做完了，下个阶段要开始理顺整个逻辑，因为作为要接收你的几分钟的胡扯，你应该对你的接收者稍微负责一点，做的有思想一点，做好看是之后的事情\n",
    "- 如果你真有能力了，再谈对数据的理解吧，因为每个正常人对单个数据的理解都有其角度，你的不一定就高过别人，所以别老哔哔。反而是随机过程，如果里面真有知识，或者能佐证你想要的观点，确实值得挖掘，不过还是不要随意表达自己幼稚的观点好了。\n",
    "- 埋头做事，别人提意见就听听，然后继续埋头做你的事。\n",
    "\n",
    "- 看看泰坦尼克号就知道了。\n",
    "    - 海事规则：救生艇乘纳1/3人；Titanic超过一半；\n",
    "    - 救生艇能救1178；最后上去了651\n",
    "    - 救生艇能装65人，但是救生艇上面不超过65人\n",
    "    - 左侧妇女儿童上，男性不可以；右侧妇女儿童完了，男性可以上\n",
    "    - Titanic估计两小时就沉没了，实际接近2小时40分钟\n",
    "    - 救生艇半空中就往下直接放\n",
    "    - "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "####  海难发生后的补救\n",
    "- 1913年11月12日，第一届海上生命安全国际大会在伦敦召开。大会通过了关于救生艇、无线电通讯、航行安全训练等数项提议，并讨论了冰山检测问题。\n",
    "- 1914年1月30日，在第一届海上生命安全国际大会上，国际冰山检测组织正式成立，直到今天它还在检测和报告大西洋与太平洋上可能威胁航船的冰山。\n",
    "- 新的规则要求，救生艇必须能容纳船上的所有人员。\n",
    "- 船员撤离手册在不同的船舶上稍有差异，但有几个原则：一是妇幼旅客先撤离；二是确保关掉所有可能导致二次事故（如漏油）的阀门，以及类似的操作；三是确保撤离后在海上漂泊的时候有足够的生存条件；具体到船员，生活服务的船员（大厨、服务生）先撤离，普通船员随后（按甲板和轮机分组混搭），船长最后撤离。\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 海难女性生还的优势？\n",
    "- 海难中妇女不存在生还优势：在18起海难中，只有2起事故女性存在生还比例优势：Birkenhand和泰坦尼克号\n",
    "- 如果船长下达了“妇女优先”的命令，女性生还的可能性会增大：如果船长严格奉行妇女和儿童优先逃生的原则，女性的生还率确实会提高，比没有遵循这一原则时高出7.3%。可见在灾难事故发生之时，一个有权威的领导者的态度可能会直接影响人们的行为。\n",
    "- "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 开始做作业\n",
    "- 数据分析\n",
    "    - 数据概览：done\n",
    "    - 测定基准：还没讲，我自己之后看视频慢慢补充进来\n",
    "    - 分析特征重要性：Sex(male+female)；Age(youth<16;adult(16<= <=60));the old(>60)；Pclass(123)；Family(SibSp+Parch)\n",
    "    - 数据展示（数据可视化）：我自己不喜欢可视化（但这个是学习过程，放下你的偏见）：暂定\n",
    "- 特征工程\n",
    "    - 数据清洗:done\n",
    "    - 提取新特征（Family（0：Alone+1：Family））:done\n",
    "    - 特征选取:done\n",
    "    - Final_data:done\n",
    "- 机器学习\n",
    "    - 算法选择\n",
    "    - 训练及评估\n",
    "    - 预测结果\n",
    "    - 人物画像\n",
    "- 解决疑问：\n",
    "    - 那么其他特征值分布比较离散的，对最终的存活率影响是怎样的呢？\n",
    "    - \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      "PassengerId    891 non-null int64\n",
      "Survived       891 non-null int64\n",
      "Pclass         891 non-null int64\n",
      "Name           891 non-null object\n",
      "Sex            891 non-null object\n",
      "Age            714 non-null float64\n",
      "SibSp          891 non-null int64\n",
      "Parch          891 non-null int64\n",
      "Ticket         891 non-null object\n",
      "Fare           891 non-null float64\n",
      "Cabin          204 non-null object\n",
      "Embarked       889 non-null object\n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 83.6+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>62</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Icard, Miss. Amelie</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>829</th>\n",
       "      <td>830</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Stone, Mrs. George Nelson (Martha Evelyn)</td>\n",
       "      <td>female</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                                       Name  \\\n",
       "61            62         1       1                        Icard, Miss. Amelie   \n",
       "829          830         1       1  Stone, Mrs. George Nelson (Martha Evelyn)   \n",
       "\n",
       "        Sex   Age  SibSp  Parch  Ticket  Fare Cabin Embarked  \n",
       "61   female  38.0      0      0  113572  80.0   B28      NaN  \n",
       "829  female  62.0      0      0  113572  80.0   B28      NaN  "
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Embarked'].isnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>55</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Ostby, Mr. Engelhart Cornelius</td>\n",
       "      <td>male</td>\n",
       "      <td>65.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>113509</td>\n",
       "      <td>61.9792</td>\n",
       "      <td>B30</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>62</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Icard, Miss. Amelie</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0000</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166</th>\n",
       "      <td>167</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Chibnall, Mrs. (Edith Martha Bowerman)</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>113505</td>\n",
       "      <td>55.0000</td>\n",
       "      <td>E33</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>252</th>\n",
       "      <td>253</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Stead, Mr. William Thomas</td>\n",
       "      <td>male</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113514</td>\n",
       "      <td>26.5500</td>\n",
       "      <td>C87</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351</th>\n",
       "      <td>352</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Williams-Lambert, Mr. Fletcher Fellows</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113510</td>\n",
       "      <td>35.0000</td>\n",
       "      <td>C128</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                                    Name  \\\n",
       "54            55         0       1          Ostby, Mr. Engelhart Cornelius   \n",
       "61            62         1       1                     Icard, Miss. Amelie   \n",
       "166          167         1       1  Chibnall, Mrs. (Edith Martha Bowerman)   \n",
       "252          253         0       1               Stead, Mr. William Thomas   \n",
       "351          352         0       1  Williams-Lambert, Mr. Fletcher Fellows   \n",
       "\n",
       "        Sex   Age  SibSp  Parch  Ticket     Fare Cabin Embarked  \n",
       "54     male  65.0      0      1  113509  61.9792   B30        C  \n",
       "61   female  38.0      0      0  113572  80.0000   B28      NaN  \n",
       "166  female   NaN      0      1  113505  55.0000   E33        S  \n",
       "252    male  62.0      0      0  113514  26.5500   C87        S  \n",
       "351    male   NaN      0      0  113510  35.0000  C128        S  "
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Ticket'].str.contains(r'1135')].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Embarked数据补充\n",
    "- 根据取Cabin为B28附近的港口，S\\C都有，所以暂时不能确定\n",
    "- 根据https://blog.csdn.net/dujiahei/article/details/85837675 取Ticket前面有1135的，发觉登录港口：S\\C也都有，暂不确定\n",
    "- 其他特征不能表达：选定Cabin最近的B30：登录港口是C"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_s.loc[61,'Embarked']='C'\n",
    "data_s.loc[829,'Embarked']='C'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]\n",
       "Index: []"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Embarked'].isnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "labels [(829, 'Embarked')] not contained in axis",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-58-41bc077e1669>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# data_s=data_s.drop((61,'Embarked'),axis=1)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mdata_s\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdata_s\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m829\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'Embarked'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m \u001b[0mdata_s\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files (x86)\\anconda\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36mdrop\u001b[1;34m(self, labels, axis, level, inplace, errors)\u001b[0m\n\u001b[0;32m   2159\u001b[0m                 \u001b[0mnew_axis\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2160\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2161\u001b[1;33m                 \u001b[0mnew_axis\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2162\u001b[0m             \u001b[0mdropped\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[1;33m{\u001b[0m\u001b[0maxis_name\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mnew_axis\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2163\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files (x86)\\anconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mdrop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m   3622\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[1;34m'ignore'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3623\u001b[0m                 raise ValueError('labels %s not contained in axis' %\n\u001b[1;32m-> 3624\u001b[1;33m                                  labels[mask])\n\u001b[0m\u001b[0;32m   3625\u001b[0m             \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m~\u001b[0m\u001b[0mmask\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3626\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: labels [(829, 'Embarked')] not contained in axis"
     ]
    }
   ],
   "source": [
    "# data_s=data_s.drop((61,'Embarked'),axis=1)\n",
    "data_s=data_s.drop((829,'Embarked'),axis=1)\n",
    "data_s.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Cabin数据填充\n",
    "- 去看了CSV数据：Cabin一样，Ticket不一样的数据有23条；Cabin一样，Ticket数据一样的数据82条：如果后续想使用Cabin数据，就按照Ticket相同，Cabin值一样进行补充，暂时不需要确定\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Age数据补充\n",
    "- youth<16;adult:16<= <=60;the old >60\n",
    "- 名字包含Master全部为男性，且年龄<12岁；之后归类为youth；目前按照年龄15填入\n",
    "- 名字包含Miss的跨度较大：0-63岁；所以不采用姓名填写；按照所有能确定年龄的占比填入\n",
    "- 在Age这个的精确性就要求这样吧，否则花费过多的精力了就\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Program Files (x86)\\anconda\\lib\\site-packages\\ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>66</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Moubarek, Master. Gerios</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2661</td>\n",
       "      <td>15.2458</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>159</th>\n",
       "      <td>160</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Master. Thomas Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.5500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>176</th>\n",
       "      <td>177</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Lefebre, Master. Henry Forbes</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>4133</td>\n",
       "      <td>25.4667</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>709</th>\n",
       "      <td>710</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Moubarek, Master. Halim Gonios (\"William George\")</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2661</td>\n",
       "      <td>15.2458</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass  \\\n",
       "65            66         1       3   \n",
       "159          160         0       3   \n",
       "176          177         0       3   \n",
       "709          710         1       3   \n",
       "\n",
       "                                                  Name   Sex Age  SibSp  \\\n",
       "65                            Moubarek, Master. Gerios  male NaN      1   \n",
       "159                         Sage, Master. Thomas Henry  male NaN      8   \n",
       "176                      Lefebre, Master. Henry Forbes  male NaN      3   \n",
       "709  Moubarek, Master. Halim Gonios (\"William George\")  male NaN      1   \n",
       "\n",
       "     Parch    Ticket     Fare Cabin Embarked  \n",
       "65       1      2661  15.2458   NaN        C  \n",
       "159      2  CA. 2343  69.5500   NaN        S  \n",
       "176      1      4133  25.4667   NaN        S  \n",
       "709      1      2661  15.2458   NaN        C  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s['Name'].str.contains(r'Master')][data_s['Age'].isnull()==True]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Cannot setitem on a Categorical with a new category, set the categories first",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-55-596d83e74c12>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdata_s\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m65\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m159\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m176\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m709\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'Age'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'15'\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mD:\\Program Files (x86)\\anconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m__setitem__\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m    177\u001b[0m             \u001b[0mkey\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_apply_if_callable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    178\u001b[0m         \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_setitem_indexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 179\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_setitem_with_indexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    180\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    181\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_has_valid_type\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files (x86)\\anconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_setitem_with_indexer\u001b[1;34m(self, indexer, value)\u001b[0m\n\u001b[0;32m    586\u001b[0m                 \u001b[1;31m# scalar\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    587\u001b[0m                 \u001b[1;32mfor\u001b[0m \u001b[0mitem\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mlabels\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 588\u001b[1;33m                     \u001b[0msetter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    589\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    590\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files (x86)\\anconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36msetter\u001b[1;34m(item, v)\u001b[0m\n\u001b[0;32m    511\u001b[0m                     \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_consolidate_inplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    512\u001b[0m                     \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 513\u001b[1;33m                     \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetitem\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mpi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    514\u001b[0m                     \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_maybe_update_cacher\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mclear\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    515\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files (x86)\\anconda\\lib\\site-packages\\pandas\\core\\internals.py\u001b[0m in \u001b[0;36msetitem\u001b[1;34m(self, **kwargs)\u001b[0m\n\u001b[0;32m   3201\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3202\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0msetitem\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3203\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'setitem'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3204\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3205\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mputmask\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files (x86)\\anconda\\lib\\site-packages\\pandas\\core\\internals.py\u001b[0m in \u001b[0;36mapply\u001b[1;34m(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)\u001b[0m\n\u001b[0;32m   3089\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3090\u001b[0m             \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'mgr'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3091\u001b[1;33m             \u001b[0mapplied\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3092\u001b[0m             \u001b[0mresult_blocks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_extend_blocks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mapplied\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresult_blocks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3093\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files (x86)\\anconda\\lib\\site-packages\\pandas\\core\\internals.py\u001b[0m in \u001b[0;36msetitem\u001b[1;34m(self, indexer, value, mgr)\u001b[0m\n\u001b[0;32m    756\u001b[0m             \u001b[1;31m# set\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    757\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 758\u001b[1;33m                 \u001b[0mvalues\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    759\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    760\u001b[0m             \u001b[1;31m# coerce and try to infer the dtypes of the result\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files (x86)\\anconda\\lib\\site-packages\\pandas\\core\\categorical.py\u001b[0m in \u001b[0;36m__setitem__\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m   1749\u001b[0m         \u001b[1;31m# something to np.nan\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1750\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mto_add\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misnull\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mto_add\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1751\u001b[1;33m             raise ValueError(\"Cannot setitem on a Categorical with a new \"\n\u001b[0m\u001b[0;32m   1752\u001b[0m                              \"category, set the categories first\")\n\u001b[0;32m   1753\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: Cannot setitem on a Categorical with a new category, set the categories first"
     ]
    }
   ],
   "source": [
    "data_s.loc[(65,159,176,709),'Age']='15'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_s.loc[65,'Age']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24.00    30\n",
       "22.00    27\n",
       "18.00    26\n",
       "19.00    25\n",
       "30.00    25\n",
       "28.00    25\n",
       "21.00    24\n",
       "25.00    23\n",
       "36.00    22\n",
       "29.00    20\n",
       "32.00    18\n",
       "27.00    18\n",
       "35.00    18\n",
       "26.00    18\n",
       "16.00    17\n",
       "31.00    17\n",
       "20.00    15\n",
       "33.00    15\n",
       "23.00    15\n",
       "34.00    15\n",
       "39.00    14\n",
       "17.00    13\n",
       "42.00    13\n",
       "40.00    13\n",
       "45.00    12\n",
       "38.00    11\n",
       "50.00    10\n",
       "2.00     10\n",
       "4.00     10\n",
       "47.00     9\n",
       "         ..\n",
       "71.00     2\n",
       "59.00     2\n",
       "63.00     2\n",
       "0.83      2\n",
       "30.50     2\n",
       "70.00     2\n",
       "57.00     2\n",
       "0.75      2\n",
       "13.00     2\n",
       "10.00     2\n",
       "64.00     2\n",
       "40.50     2\n",
       "32.50     2\n",
       "45.50     2\n",
       "20.50     1\n",
       "24.50     1\n",
       "0.67      1\n",
       "14.50     1\n",
       "0.92      1\n",
       "74.00     1\n",
       "34.50     1\n",
       "80.00     1\n",
       "12.00     1\n",
       "36.50     1\n",
       "53.00     1\n",
       "55.50     1\n",
       "70.50     1\n",
       "66.00     1\n",
       "23.50     1\n",
       "0.42      1\n",
       "Name: Age, Length: 88, dtype: int64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s['Age'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 177 entries, 5 to 888\n",
      "Data columns (total 12 columns):\n",
      "PassengerId    177 non-null int64\n",
      "Survived       177 non-null int64\n",
      "Pclass         177 non-null int64\n",
      "Name           177 non-null object\n",
      "Sex            177 non-null object\n",
      "Age            0 non-null float64\n",
      "SibSp          177 non-null int64\n",
      "Parch          177 non-null int64\n",
      "Ticket         177 non-null object\n",
      "Fare           177 non-null float64\n",
      "Cabin          19 non-null object\n",
      "Embarked       177 non-null object\n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 18.0+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s[data_s['Age'].isnull()].info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_s_Age=data_s.dropna(subset=['Age'],axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Program Files (x86)\\anconda\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "data_s_Age['Age']=data_s_Age['Age'].astype('float')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 714 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      "PassengerId    714 non-null int64\n",
      "Survived       714 non-null int64\n",
      "Pclass         714 non-null int64\n",
      "Name           714 non-null object\n",
      "Sex            714 non-null object\n",
      "Age            714 non-null float64\n",
      "SibSp          714 non-null int64\n",
      "Parch          714 non-null int64\n",
      "Ticket         714 non-null object\n",
      "Fare           714 non-null float64\n",
      "Cabin          185 non-null object\n",
      "Embarked       714 non-null object\n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 72.5+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s_Age.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Program Files (x86)\\anconda\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "data_s_Age['Age']=pd.cut(data_s_Age['Age'],bins=[0,16,60,np.inf],labels=['youth','adult','the old'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "adult      592\n",
       "youth      100\n",
       "the old     22\n",
       "Name: Age, dtype: int64"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_Age['Age'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "youth:13.6%\n",
      "adult:82.9%\n",
      "the old:3.1%\n"
     ]
    }
   ],
   "source": [
    "print('youth:{:.1f}%'.format(100*97/len(data_s_Age['Age'])))\n",
    "print('adult:{:.1f}%'.format(100*592/len(data_s_Age['Age'])))\n",
    "print('the old:{:.1f}%'.format(100*22/len(data_s_Age['Age'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "module 'pandas' has no attribute 'isna'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-49-7fb7baaffe3a>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdata_s_null\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwhere\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0misna\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_s\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Age'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mdata_s\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Age'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdata_s_null\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_s_null\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m*\u001b[0m\u001b[1;36m0.135\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m15\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mAttributeError\u001b[0m: module 'pandas' has no attribute 'isna'"
     ]
    }
   ],
   "source": [
    "data_s_null=np.where(pd.isna(data_s['Age']))[0]\n",
    "data_s['Age'].loc[data_s_null[0:int(len(data_s_null)*0.135)]]=15"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'data_s_null' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-50-dc1ecd094b68>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdata_s\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Age'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdata_s_null\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_s_null\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m*\u001b[0m\u001b[1;36m0.135\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0.967\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_s_null\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m40\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m: name 'data_s_null' is not defined"
     ]
    }
   ],
   "source": [
    "data_s['Age'].loc[data_s_null[int(len(data_s_null)*0.135):int(0.967*len(data_s_null))]]=40"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'data_s_null' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-51-b331964743a0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdata_s\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Age'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdata_s_null\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0.967\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_s_null\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m65\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m: name 'data_s_null' is not defined"
     ]
    }
   ],
   "source": [
    "data_s['Age'].loc[data_s_null[int(0.967*len(data_s_null)):]]=65"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_s['Age']=data_s['Age'].astype('float')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_s['Age']=pd.cut(data_s['Age'],bins=[0,16,60,np.inf],labels=[0,1,2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male   1      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   1      1   \n",
       "2                             Heikkinen, Miss. Laina  female   1      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   1      1   \n",
       "4                           Allen, Mr. William Henry    male   1      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 添加一个新的特征Family\n",
    "- 如果SibSp+Parch=0；则为Alone：记为0\n",
    "- 否则记为Family：记为1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      1\n",
       "1      1\n",
       "2      0\n",
       "3      1\n",
       "4      0\n",
       "      ..\n",
       "886    0\n",
       "887    0\n",
       "888    3\n",
       "889    0\n",
       "890    0\n",
       "Name: Family, Length: 891, dtype: int64"
      ]
     },
     "execution_count": 181,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s['Family']=data_s['SibSp']+data_s['Parch']\n",
    "data_s['Family']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "data_s['Family'][data_s['Family']>1]=1\n",
    "data_s['Family'][data_s['Family']==0]=0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "data_s['Sex'][data_s['Sex']=='male']=1\n",
    "data_s['Sex'][data_s['Sex']=='female']=0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    699\n",
       "0    168\n",
       "2     24\n",
       "Name: Age, dtype: int64"
      ]
     },
     "execution_count": 191,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s['Age'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    577\n",
       "0    314\n",
       "Name: Sex, dtype: int64"
      ]
     },
     "execution_count": 192,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s['Sex'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 13 columns):\n",
      " #   Column       Non-Null Count  Dtype   \n",
      "---  ------       --------------  -----   \n",
      " 0   PassengerId  891 non-null    int64   \n",
      " 1   Survived     891 non-null    int64   \n",
      " 2   Pclass       891 non-null    int64   \n",
      " 3   Name         891 non-null    object  \n",
      " 4   Sex          891 non-null    object  \n",
      " 5   Age          891 non-null    category\n",
      " 6   SibSp        891 non-null    int64   \n",
      " 7   Parch        891 non-null    int64   \n",
      " 8   Ticket       891 non-null    object  \n",
      " 9   Fare         891 non-null    float64 \n",
      " 10  Cabin        204 non-null    object  \n",
      " 11  Embarked     891 non-null    object  \n",
      " 12  Family       891 non-null    int64   \n",
      "dtypes: category(1), float64(1), int64(6), object(5)\n",
      "memory usage: 84.6+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_s['Sex']=data_s['Sex'].astype('int')\n",
    "data_s['Age']=data_s['Age'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 13 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   PassengerId  891 non-null    int64  \n",
      " 1   Survived     891 non-null    int64  \n",
      " 2   Pclass       891 non-null    int64  \n",
      " 3   Name         891 non-null    object \n",
      " 4   Sex          891 non-null    int32  \n",
      " 5   Age          891 non-null    int32  \n",
      " 6   SibSp        891 non-null    int64  \n",
      " 7   Parch        891 non-null    int64  \n",
      " 8   Ticket       891 non-null    object \n",
      " 9   Fare         891 non-null    float64\n",
      " 10  Cabin        204 non-null    object \n",
      " 11  Embarked     891 non-null    object \n",
      " 12  Family       891 non-null    int64  \n",
      "dtypes: float64(1), int32(2), int64(6), object(4)\n",
      "memory usage: 83.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Family</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.005007</td>\n",
       "      <td>-0.035144</td>\n",
       "      <td>0.042939</td>\n",
       "      <td>0.187961</td>\n",
       "      <td>-0.057527</td>\n",
       "      <td>-0.001652</td>\n",
       "      <td>0.012658</td>\n",
       "      <td>-0.057462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Survived</th>\n",
       "      <td>-0.005007</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.338481</td>\n",
       "      <td>-0.543351</td>\n",
       "      <td>-0.104620</td>\n",
       "      <td>-0.035322</td>\n",
       "      <td>0.081629</td>\n",
       "      <td>0.257307</td>\n",
       "      <td>0.203367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pclass</th>\n",
       "      <td>-0.035144</td>\n",
       "      <td>-0.338481</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.131900</td>\n",
       "      <td>-0.202339</td>\n",
       "      <td>0.083081</td>\n",
       "      <td>0.018443</td>\n",
       "      <td>-0.549500</td>\n",
       "      <td>-0.135207</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sex</th>\n",
       "      <td>0.042939</td>\n",
       "      <td>-0.543351</td>\n",
       "      <td>0.131900</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.103940</td>\n",
       "      <td>-0.114631</td>\n",
       "      <td>-0.245489</td>\n",
       "      <td>-0.182333</td>\n",
       "      <td>-0.303646</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Age</th>\n",
       "      <td>0.187961</td>\n",
       "      <td>-0.104620</td>\n",
       "      <td>-0.202339</td>\n",
       "      <td>0.103940</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.287102</td>\n",
       "      <td>-0.214659</td>\n",
       "      <td>0.055598</td>\n",
       "      <td>-0.225516</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SibSp</th>\n",
       "      <td>-0.057527</td>\n",
       "      <td>-0.035322</td>\n",
       "      <td>0.083081</td>\n",
       "      <td>-0.114631</td>\n",
       "      <td>-0.287102</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.414838</td>\n",
       "      <td>0.159651</td>\n",
       "      <td>0.584471</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Parch</th>\n",
       "      <td>-0.001652</td>\n",
       "      <td>0.081629</td>\n",
       "      <td>0.018443</td>\n",
       "      <td>-0.245489</td>\n",
       "      <td>-0.214659</td>\n",
       "      <td>0.414838</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.216225</td>\n",
       "      <td>0.583398</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Fare</th>\n",
       "      <td>0.012658</td>\n",
       "      <td>0.257307</td>\n",
       "      <td>-0.549500</td>\n",
       "      <td>-0.182333</td>\n",
       "      <td>0.055598</td>\n",
       "      <td>0.159651</td>\n",
       "      <td>0.216225</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.271832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Family</th>\n",
       "      <td>-0.057462</td>\n",
       "      <td>0.203367</td>\n",
       "      <td>-0.135207</td>\n",
       "      <td>-0.303646</td>\n",
       "      <td>-0.225516</td>\n",
       "      <td>0.584471</td>\n",
       "      <td>0.583398</td>\n",
       "      <td>0.271832</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             PassengerId  Survived    Pclass       Sex       Age     SibSp  \\\n",
       "PassengerId     1.000000 -0.005007 -0.035144  0.042939  0.187961 -0.057527   \n",
       "Survived       -0.005007  1.000000 -0.338481 -0.543351 -0.104620 -0.035322   \n",
       "Pclass         -0.035144 -0.338481  1.000000  0.131900 -0.202339  0.083081   \n",
       "Sex             0.042939 -0.543351  0.131900  1.000000  0.103940 -0.114631   \n",
       "Age             0.187961 -0.104620 -0.202339  0.103940  1.000000 -0.287102   \n",
       "SibSp          -0.057527 -0.035322  0.083081 -0.114631 -0.287102  1.000000   \n",
       "Parch          -0.001652  0.081629  0.018443 -0.245489 -0.214659  0.414838   \n",
       "Fare            0.012658  0.257307 -0.549500 -0.182333  0.055598  0.159651   \n",
       "Family         -0.057462  0.203367 -0.135207 -0.303646 -0.225516  0.584471   \n",
       "\n",
       "                Parch      Fare    Family  \n",
       "PassengerId -0.001652  0.012658 -0.057462  \n",
       "Survived     0.081629  0.257307  0.203367  \n",
       "Pclass       0.018443 -0.549500 -0.135207  \n",
       "Sex         -0.245489 -0.182333 -0.303646  \n",
       "Age         -0.214659  0.055598 -0.225516  \n",
       "SibSp        0.414838  0.159651  0.584471  \n",
       "Parch        1.000000  0.216225  0.583398  \n",
       "Fare         0.216225  1.000000  0.271832  \n",
       "Family       0.583398  0.271832  1.000000  "
      ]
     },
     "execution_count": 196,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_metric=data_s.corr()\n",
    "data_s_metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Survived       1.000000\n",
       "Fare           0.257307\n",
       "Family         0.203367\n",
       "Parch          0.081629\n",
       "PassengerId   -0.005007\n",
       "SibSp         -0.035322\n",
       "Age           -0.104620\n",
       "Pclass        -0.338481\n",
       "Sex           -0.543351\n",
       "Name: Survived, dtype: float64"
      ]
     },
     "execution_count": 197,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_metric['Survived'].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_s_new=data_s.drop(columns=['PassengerId','Name','Ticket','Cabin','Embarked'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Family</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass  Sex  Age  SibSp  Parch     Fare  Family\n",
       "0         0       3    1    1      1      0   7.2500       1\n",
       "1         1       1    0    1      1      0  71.2833       1\n",
       "2         1       3    0    1      0      0   7.9250       0\n",
       "3         1       1    0    1      1      0  53.1000       1\n",
       "4         0       3    1    1      0      0   8.0500       0"
      ]
     },
     "execution_count": 202,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_new.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      False\n",
       "1       True\n",
       "2       True\n",
       "3       True\n",
       "4      False\n",
       "       ...  \n",
       "886    False\n",
       "887     True\n",
       "888    False\n",
       "889     True\n",
       "890    False\n",
       "Name: Survived, Length: 891, dtype: bool"
      ]
     },
     "execution_count": 225,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data=data_s_new.drop('Survived',axis=1)\n",
    "target=(data_s_new['Survived']==1)\n",
    "target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
       "                     weights='uniform')"
      ]
     },
     "execution_count": 226,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kn_clf=KNeighborsClassifier()\n",
    "kn_clf.fit(data,target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 227,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_sample=data.iloc[1,:]\n",
    "data_s_sample\n",
    "y_s_sample=target.iloc[1]\n",
    "y_s_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ True])"
      ]
     },
     "execution_count": 228,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kn_clf.predict([data_s_sample])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import precision_score,recall_score\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import cross_val_predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.73400673, 0.77104377, 0.76430976])"
      ]
     },
     "execution_count": 229,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cross_val_score(kn_clf,data,target,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred=cross_val_predict(kn_clf,data,target,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precision:68.4%\n",
      "recall:67.8%\n"
     ]
    }
   ],
   "source": [
    "precision=precision_score(target,y_pred)\n",
    "recall=recall_score(target,y_pred)\n",
    "print('precision:{:.1f}%'.format(precision*100))\n",
    "print('recall:{:.1f}%'.format(recall*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3, error_score=nan,\n",
       "             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,\n",
       "                                            metric='minkowski',\n",
       "                                            metric_params=None, n_jobs=None,\n",
       "                                            n_neighbors=5, p=2,\n",
       "                                            weights='uniform'),\n",
       "             iid='deprecated', n_jobs=None,\n",
       "             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
       "                          'weights': ['uniform']},\n",
       "                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
       "                          'weights': ['distance']}],\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring='accuracy', verbose=0)"
      ]
     },
     "execution_count": 233,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "param_grid=[\n",
    "    {'weights':['uniform'],'n_neighbors':[i for i in range(1,11)]},\n",
    "    {'weights':['distance'],'n_neighbors':[i for i in range(1,11)]},\n",
    "]\n",
    "final_kn_clf=GridSearchCV(kn_clf,param_grid,cv=3,\n",
    "                         scoring='accuracy')\n",
    "final_kn_clf.fit(data,target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,\n",
       "                     weights='distance')"
      ]
     },
     "execution_count": 234,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_kn_clf.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "kn_clf_new=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
    "                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,\n",
    "                     weights='distance')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,\n",
       "                     weights='distance')"
      ]
     },
     "execution_count": 236,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kn_clf_new.fit(data,target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.74747475, 0.8013468 , 0.78451178])"
      ]
     },
     "execution_count": 237,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cross_val_score(kn_clf_new,data,target,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred_new=cross_val_predict(kn_clf_new,data,target,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precision:72.4%\n",
      "recall:68.1%\n"
     ]
    }
   ],
   "source": [
    "precision=precision_score(target,y_pred_new)\n",
    "recall=recall_score(target,y_pred_new)\n",
    "print('precision:{:.1f}%'.format(precision*100))\n",
    "print('recall:{:.1f}%'.format(recall*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 418 entries, 0 to 417\n",
      "Data columns (total 11 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   PassengerId  418 non-null    int64  \n",
      " 1   Pclass       418 non-null    int64  \n",
      " 2   Name         418 non-null    object \n",
      " 3   Sex          418 non-null    object \n",
      " 4   Age          332 non-null    float64\n",
      " 5   SibSp        418 non-null    int64  \n",
      " 6   Parch        418 non-null    int64  \n",
      " 7   Ticket       418 non-null    object \n",
      " 8   Fare         417 non-null    float64\n",
      " 9   Cabin        91 non-null     object \n",
      " 10  Embarked     418 non-null    object \n",
      "dtypes: float64(2), int64(4), object(5)\n",
      "memory usage: 36.0+ KB\n"
     ]
    }
   ],
   "source": [
    "test=pd.read_csv('test.csv')\n",
    "test.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      False\n",
       "1       True\n",
       "2      False\n",
       "3      False\n",
       "4       True\n",
       "       ...  \n",
       "413    False\n",
       "414     True\n",
       "415    False\n",
       "416    False\n",
       "417    False\n",
       "Name: Survived, Length: 418, dtype: bool"
      ]
     },
     "execution_count": 247,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_test=pd.read_csv('gender_submission.csv')\n",
    "target_test=target_test['Survived']\n",
    "target_test=(target_test==1)\n",
    "target_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 245,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "test['Sex'][test['Sex']=='male']=1\n",
    "test['Sex'][test['Sex']=='female']=0\n",
    "test['Sex']=test['Sex'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n",
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "test['Family']=test['Parch']+test['SibSp']\n",
    "test['Family'][test['Family']==0]=0\n",
    "test['Family'][test['Family']>0]=1\n",
    "test['Family']=test['Family'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "test_null=np.where(pd.isna(test['Age']))[0]\n",
    "test['Age'].loc[test_null[0:int(len(test_null)*0.135)]]=15\n",
    "test['Age'].loc[test_null[int(len(test_null)*0.135):int(0.967*len(test_null))]]=40\n",
    "test['Age'].loc[test_null[int(0.967*len(test_null)):]]=65\n",
    "test['Age']=test['Age'].astype('float')\n",
    "test['Age']=pd.cut(test['Age'],bins=[0,16,60,np.inf],labels=[0,1,2])\n",
    "test['Age'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_new=test.drop(columns=['PassengerId','Name','Ticket','Cabin','Embarked'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 251,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 418 entries, 0 to 417\n",
      "Data columns (total 7 columns):\n",
      " #   Column  Non-Null Count  Dtype   \n",
      "---  ------  --------------  -----   \n",
      " 0   Pclass  418 non-null    int64   \n",
      " 1   Sex     418 non-null    int32   \n",
      " 2   Age     418 non-null    category\n",
      " 3   SibSp   418 non-null    int64   \n",
      " 4   Parch   418 non-null    int64   \n",
      " 5   Fare    417 non-null    float64 \n",
      " 6   Family  418 non-null    int32   \n",
      "dtypes: category(1), float64(1), int32(2), int64(3)\n",
      "memory usage: 17.0 KB\n"
     ]
    }
   ],
   "source": [
    "test_new.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\pandas\\core\\indexing.py:670: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._setitem_with_indexer(indexer, value)\n"
     ]
    }
   ],
   "source": [
    "test_new['Fare'].iloc[np.where(pd.isna(test_new['Fare']))]=test_new['Fare'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 418 entries, 0 to 417\n",
      "Data columns (total 7 columns):\n",
      " #   Column  Non-Null Count  Dtype   \n",
      "---  ------  --------------  -----   \n",
      " 0   Pclass  418 non-null    float64 \n",
      " 1   Sex     418 non-null    float64 \n",
      " 2   Age     418 non-null    category\n",
      " 3   SibSp   418 non-null    int64   \n",
      " 4   Parch   418 non-null    int64   \n",
      " 5   Fare    418 non-null    float64 \n",
      " 6   Family  418 non-null    int32   \n",
      "dtypes: category(1), float64(3), int32(1), int64(2)\n",
      "memory usage: 18.6 KB\n"
     ]
    }
   ],
   "source": [
    "test_new.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 259,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_new['Age']=test_new['Age'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 260,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_pred=cross_val_predict(kn_clf_new,test_new,target_test,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precision:73.9%\n",
      "recall:76.3%\n"
     ]
    }
   ],
   "source": [
    "precision=precision_score(target_test,test_pred)\n",
    "recall=recall_score(target_test,test_pred)\n",
    "print('precision:{:.1f}%'.format(precision*100))\n",
    "print('recall:{:.1f}%'.format(recall*100))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 结论\n",
    "- K近邻没有选择最佳参数表现：\n",
    "    - precision:68.4%\n",
    "    - recall:67.8%\n",
    "- 选择最佳参数后模型表现\n",
    "    - precision:72.4%\n",
    "    - recall:68.1%\n",
    "- 测试集表现\n",
    "    - precision:73.9%\n",
    "    - recall:76.3%"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 我自己下去再调整模型吧，这个都好几天了"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
